{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b7e5243d-8ee8-4bf1-8c00-a15259f81f59",
   "metadata": {},
   "source": [
    "# Import packages and make a connection to Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "31170adc-ca9d-4d74-9441-89ed84855fea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'h2QwONxsT4Kt-lTRKmPrhg', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
     ]
    }
   ],
   "source": [
    "from elasticsearch import Elasticsearch, helpers, exceptions\n",
    "import json\n",
    "import time,os\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "# Credentials are read from the environment (populated from .env by load_dotenv above).\n",
    "openai_api_key = os.getenv('OPENAI_API_KEY')\n",
    "elastic_user = os.getenv('ES_USER')\n",
    "elastic_password = os.getenv('ES_PASSWORD')\n",
    "\n",
    "# Fail fast with a clear message instead of silently sending the string \"None\" as a credential.\n",
    "missing = [\n",
    "    name\n",
    "    for name, value in {\n",
    "        'OPENAI_API_KEY': openai_api_key,\n",
    "        'ES_USER': elastic_user,\n",
    "        'ES_PASSWORD': elastic_password,\n",
    "    }.items()\n",
    "    if not value\n",
    "]\n",
    "if missing:\n",
    "    raise RuntimeError(f\"Missing required environment variables: {', '.join(missing)}\")\n",
    "\n",
    "# Credentials go through basic_auth rather than being embedded in the URL: a password\n",
    "# containing characters such as '@', ':' or '/' would otherwise corrupt the URL.\n",
    "url = \"https://localhost:9200\"\n",
    "client = Elasticsearch(\n",
    "    url,\n",
    "    basic_auth=(elastic_user, elastic_password),\n",
    "    ca_certs=\"./http_ca.crt\",\n",
    "    verify_certs=True,\n",
    ")\n",
    "\n",
    "# Round-trip to the cluster to confirm that the connection and credentials work.\n",
    "print(client.info())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "226c0500-56b2-462e-9453-e6a589dba955",
   "metadata": {},
   "source": [
    "# Create the inference task"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dbfc57b6-99e6-4659-9a56-8844b3f2ddc4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'model_id': 'my_openai_embedding_model', 'task_type': 'text_embedding', 'service': 'openai', 'service_settings': {'similarity': 'dot_product', 'dimensions': 1536}, 'task_settings': {'model': 'text-embedding-ada-002'}})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Configuration for the OpenAI-backed embedding model registered below.\n",
    "openai_inference_config = {\n",
    "    \"service\": \"openai\",\n",
    "    \"service_settings\": {\"api_key\": openai_api_key},\n",
    "    \"task_settings\": {\"model\": \"text-embedding-ada-002\"},\n",
    "}\n",
    "\n",
    "# Last expression of the cell, so the API response is displayed as the cell output.\n",
    "client.inference.put_model(\n",
    "    task_type=\"text_embedding\",\n",
    "    model_id=\"my_openai_embedding_model\",\n",
    "    body=openai_inference_config,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5850414-ac92-4e57-aeb0-c687b723d458",
   "metadata": {},
   "source": [
    "# Create an ingest pipeline with an inference processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "396155f7-e358-4665-b920-985473604d91",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inference processor: reads the `plot` field and writes the model output\n",
    "# to `plot_embedding`.\n",
    "embed_plot_processor = {\n",
    "    \"inference\": {\n",
    "        \"model_id\": \"my_openai_embedding_model\",\n",
    "        \"input_output\": {\n",
    "            \"input_field\": \"plot\",\n",
    "            \"output_field\": \"plot_embedding\",\n",
    "        },\n",
    "    }\n",
    "}\n",
    "\n",
    "# Last expression of the cell, so the API response is displayed as the cell output.\n",
    "client.ingest.put_pipeline(\n",
    "    id=\"openai_embeddings_pipeline\",\n",
    "    description=\"Ingest pipeline for OpenAI inference.\",\n",
    "    processors=[embed_plot_processor],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62f2cdb8-e51f-4bd3-8138-dba7d2043509",
   "metadata": {},
   "source": [
    "# Create index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cfa13d4d-ee3e-4c84-83cb-a4bfe4654ffc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'openai-movie-embeddings'})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop any previous copy of the index so this cell can be re-run;\n",
    "# ignore_unavailable makes a missing index a no-op rather than an error.\n",
    "client.indices.delete(index=\"openai-movie-embeddings\", ignore_unavailable=True)\n",
    "\n",
    "# Every document indexed here is routed through the embedding pipeline by default.\n",
    "index_settings = {\"index\": {\"default_pipeline\": \"openai_embeddings_pipeline\"}}\n",
    "\n",
    "# `plot` holds the raw text; `plot_embedding` holds its 1536-dimension vector.\n",
    "index_mappings = {\n",
    "    \"properties\": {\n",
    "        \"plot_embedding\": {\n",
    "            \"type\": \"dense_vector\",\n",
    "            \"dims\": 1536,\n",
    "            \"similarity\": \"dot_product\",\n",
    "        },\n",
    "        \"plot\": {\"type\": \"text\"},\n",
    "    }\n",
    "}\n",
    "\n",
    "client.indices.create(\n",
    "    index=\"openai-movie-embeddings\",\n",
    "    settings=index_settings,\n",
    "    mappings=index_mappings,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50a1f040-8853-4e5e-bee4-a5c3dc72011e",
   "metadata": {},
   "source": [
    "# Insert Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b6265eac-889a-4b00-914b-af18fcf64264",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done indexing documents into `openai-movie-embeddings` index!\n"
     ]
    }
   ],
   "source": [
    "# `helpers`, `json` and `client` all come from the first cell of the notebook.\n",
    "\n",
    "# Read the movie records; the encoding is explicit so the result does not depend\n",
    "# on the platform's default locale.\n",
    "with open('movies.json', encoding='utf-8') as f:\n",
    "    data_json = json.load(f)\n",
    "\n",
    "# One bulk action per movie record, all targeting the same index.\n",
    "documents = [\n",
    "    {\"_index\": \"openai-movie-embeddings\", \"_source\": doc}\n",
    "    for doc in data_json\n",
    "]\n",
    "\n",
    "# Use helpers.bulk to index\n",
    "helpers.bulk(client, documents)\n",
    "\n",
    "# Explicitly refresh so the new documents are searchable in the next cell,\n",
    "# instead of sleeping for a fixed time and hoping a refresh has happened.\n",
    "client.indices.refresh(index=\"openai-movie-embeddings\")\n",
    "\n",
    "print(\"Done indexing documents into `openai-movie-embeddings` index!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "82ed1a03-4aad-4fd4-82e2-670eddf3db98",
   "metadata": {},
   "source": [
    "# Semantic search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e6176296-f4a8-4d66-80ee-c592bcc57cf3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 0.91674197\n",
      "Title: Fight Club\n",
      "Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.\n",
      "\n",
      "Score: 0.9069591\n",
      "Title: Pulp Fiction\n",
      "Plot: The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.\n",
      "\n",
      "Score: 0.8992071\n",
      "Title: The Dark Knight\n",
      "Plot: When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# kNN search: the query text is embedded server-side with the same model that\n",
    "# the ingest pipeline uses for the `plot_embedding` field.\n",
    "response = client.search(\n",
    "    index='openai-movie-embeddings',\n",
    "    size=3,\n",
    "    # Only title and plot are printed below, so do not ship the stored\n",
    "    # 1536-dimension embedding back with every hit.\n",
    "    source_includes=[\"title\", \"plot\"],\n",
    "    knn={\n",
    "        \"field\": \"plot_embedding\",\n",
    "        \"query_vector_builder\": {\n",
    "            \"text_embedding\": {\n",
    "                \"model_id\": \"my_openai_embedding_model\",\n",
    "                \"model_text\": \"Fighting movie\"\n",
    "            }\n",
    "        },\n",
    "        \"k\": 10,\n",
    "        \"num_candidates\": 100\n",
    "    }\n",
    ")\n",
    "\n",
    "# Print score, title and plot for each of the returned hits.\n",
    "for hit in response['hits']['hits']:\n",
    "    score = hit['_score']\n",
    "    title = hit['_source']['title']\n",
    "    plot = hit['_source']['plot']\n",
    "    print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79e05765-708c-4c64-8e71-2e3cb1df6b63",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_3.11",
   "language": "python",
   "name": "python_3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
